SETUP

Load packages and paths

library(grid)
library(ggplot2)
library(plyr)
library(RColorBrewer)

Color Scheme

CMI Recommendations

cmi_main_blue = "#0071b2"
cmi_grey = "#929d9e"
cmi_light_blue = "#00c4d9"
cmi_pea_green = "#b5bf00"

cmi_rich_green = "#73933d"
cmi_rich_purple = "#8e7fac"
cmi_rich_red = "#d75920"
cmi_rich_blue = "#4c87a1"
cmi_rich_aqua = "#66c7c3"
cmi_rich_orange = "#eebf42"

cmi_vibrant_yellow = "#ffd457"
cmi_vibrant_orange = "#f58025"
cmi_vibrant_green = "#78a22f"
cmi_vibrant_garnet = "#e6006f"
cmi_vibrant_purple = "#9A4d9e"
cmi_vibrant_blue = "#19398a"

cmi_site_colors = c(cmi_vibrant_blue, cmi_rich_blue, cmi_vibrant_purple, cmi_vibrant_garnet, 
    cmi_rich_red, cmi_vibrant_orange, cmi_vibrant_yellow, cmi_vibrant_green)
cmi_site_colors_ramp = colorRampPalette(cmi_site_colors)

Load data

Read in the data and then some

# setwd('/home2/zarrar/projects/qc') setwd('~/zarrar/qc')
setwd("~/Dropbox/Research/cmi/qc")
df <- read.csv("corr.qc/qc_filt_epi_derivatives.csv")[, -1]
nsites <- length(unique(df$site))

Percentiles

In our plots, we want to have percentile lines for each QC measure to indicate the distribution of each site relative to the whole sample

qc.measures <- colnames(df)[!(colnames(df) %in% c("subject", "site", "site.name", 
    "session", "scan", "global"))]
qvals <- c(0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99)
qcat <- c(1, 5, 25, 50, 25, 5, 1)
qline <- c(3, 2, 5, 1, 5, 2, 3)
qsize <- c(0.4, 0.25, 0.3, 0.25, 0.3, 0.25, 0.4)
qcols <- c("grey10", "grey10", "grey10", "grey50", "grey10", "grey10", "grey10")
# qcols <- brewer.pal(8, 'Dark2')[c(1,2,3,4,3,2,1)]

Now let's get the percentiles

percentiles <- apply(subset(df, select = qc.measures), 2, quantile, qvals, na.rm = TRUE)
percentiles <- as.data.frame(cbind(percentiles, qcat, qline, qsize))
percentiles$qline <- as.factor(qline)
percentiles$qcat <- as.factor(qcat)
print(percentiles)
##     falff_50 falff_75 falff_90 falff_fwhm falff_mean reho_50 reho_75
## 1%    0.3900   0.4347   0.4817      2.423     0.3971 0.05873 0.07763
## 5%    0.5032   0.5555   0.6113      2.741     0.5099 0.06914 0.09768
## 25%   0.6149   0.6561   0.7016      3.380     0.6204 0.09524 0.14052
## 50%   0.6423   0.6932   0.7475      4.163     0.6508 0.11584 0.17263
## 75%   0.6850   0.7344   0.7837      4.826     0.6895 0.15466 0.22570
## 95%   0.7497   0.7844   0.8191      5.690     0.7492 0.21801 0.31196
## 99%   0.7619   0.8014   0.8364      6.136     0.7614 0.25382 0.35066
##     reho_90 reho_fwhm reho_mean vmhc_50 vmhc_75 vmhc_90 vmhc_fwhm
## 1%   0.1051     5.817   0.06685  0.1776  0.3422  0.5338     5.393
## 5%   0.1349     8.109   0.08104  0.2355  0.4187  0.6079     5.803
## 25%  0.1929    10.299   0.11067  0.3217  0.5170  0.6864     6.255
## 50%  0.2337    11.586   0.13214  0.3919  0.5826  0.7375     6.465
## 75%  0.2977    12.695   0.17183  0.4566  0.6427  0.7841     6.649
## 95%  0.4010    13.833   0.23686  0.5579  0.7276  0.8455     6.900
## 99%  0.4430    15.088   0.26737  0.6078  0.7767  0.8755     7.081
##     vmhc_mean qcat qline qsize
## 1%     0.2136    1     3  0.40
## 5%     0.2625    5     2  0.25
## 25%    0.3293   25     5  0.30
## 50%    0.3801   50     1  0.25
## 75%    0.4260   25     5  0.30
## 95%    0.4957    5     2  0.25
## 99%    0.5303    1     3  0.40

Measure Descriptions

Associate a detailed description with each measure

dnames <- c("fALFF", "REHO", "VMHC")
mnames <- c("Median", "75th Percentile", "90th Percentile", "FWHM (mm)", "Mean")
descs <- expand.grid(m = mnames, d = dnames)
descs <- paste(descs[, 2], descs[, 1], sep = " - ")
mdf <- data.frame(measure = qc.measures, description = descs)
print(mdf)
##       measure             description
## 1    falff_50          fALFF - Median
## 2    falff_75 fALFF - 75th Percentile
## 3    falff_90 fALFF - 90th Percentile
## 4  falff_fwhm       fALFF - FWHM (mm)
## 5  falff_mean            fALFF - Mean
## 6     reho_50           REHO - Median
## 7     reho_75  REHO - 75th Percentile
## 8     reho_90  REHO - 90th Percentile
## 9   reho_fwhm        REHO - FWHM (mm)
## 10  reho_mean             REHO - Mean
## 11    vmhc_50           VMHC - Median
## 12    vmhc_75  VMHC - 75th Percentile
## 13    vmhc_90  VMHC - 90th Percentile
## 14  vmhc_fwhm        VMHC - FWHM (mm)
## 15  vmhc_mean             VMHC - Mean
cat("CHECK ABOVE...do the columns for each row match\n")
## CHECK ABOVE...do the columns for each row match

PLOTS

More Setup

This function will add percentile lines in the background plot: ggplot object pdf: percentile data frame

compile_percentiles <- function(pdf, measure, cols = NULL) {
    ret <- lapply(1:nrow(pdf), function(i) {
        p <- pdf[i, ]
        if (!is.null(cols)) {
            plot <- geom_hline(aes_string(yintercept = measure), data = p, size = as.numeric(p$qsize), 
                linetype = as.numeric(p$qline), color = cols[i])
            # as.character(p$qcolor[1])
        } else {
            plot <- geom_hline(aes_string(yintercept = measure), data = p, size = as.numeric(p$qsize[1]), 
                linetype = as.numeric(p$qline[1]), color = "grey50")
        }
        return(plot)
    })
    return(ret)
}

Outliers

Sometimes extreme data-points can skew the plot and make it difficult to see the spread of the data.

I will avoid plotting those outlier points by setting the axis to only the points that I want. ggplot will complain but let her (poor baby).

# functions
range.outlier.iqr <- function(x, times = 3) {
    upper.limit <- quantile(x, 0.75) + times * IQR(x)
    lower.limit <- quantile(x, 0.25) - times * IQR(x)
    return(c(lower.limit, upper.limit))
}
outlier.iqr <- function(x, times = 3) {
    tmp <- range.outlier.iqr(x, times)
    lower.limit <- tmp[1]
    upper.limit <- tmp[2]
    return((x > upper.limit) | (x < lower.limit))
}
# outlier values (if any)
lst.outlier.iqr <- llply(qc.measures, function(measure) {
    ret <- subset(df, select = c("subject", "site", "site.name", "session", 
        "scan", measure))
    inds <- outlier.iqr(df[[measure]])
    return(ret[inds, ])
})
names(lst.outlier.iqr) <- qc.measures
# new ranges of our plots (sans outliers)
df.range.iqr <- as.data.frame(sapply(qc.measures, function(m) {
    inds <- !outlier.iqr(df[[m]])
    range(df[[m]][inds]) * 1.1
}))

QC Derivative Measures

I will be plotting a bunch of different data-sets here. First I will have all the data, then I will have it remote 3 x the IQR. So two sets of the same plots.

REMOVE OUTLIERS

for (i in 1:nrow(mdf)) {
    measure <- as.character(mdf$measure[i])
    desc <- as.character(mdf$description[i])

    # ### Option 1 First, I'll plot ones with 1%, 5%, 25%, and 50% percentile
    # lines
    pg1 = ggplot(df, aes_string(x = "site.name", y = measure))

    # Add those percentile lines
    pg2 = pg1 + compile_percentiles(percentiles, measure, qcols)
    # pg2=pg1 p <- percentiles[1,] pg2=pg2 +
    # geom_hline(aes_string(yintercept=measure), data=p) p <- percentiles[2,]
    # pg2=pg2 + geom_hline(aes_string(yintercept=measure), data=p) p <-
    # percentiles[3,] pg2=pg2 + geom_hline(aes_string(yintercept=measure),
    # data=p)


    # Add main plot - violin plot + boxplot for all the data - jitter plot for
    # each site (adjust the color) - x and y labels
    pg3 = pg2 + geom_violin(aes(x = global), color = "gray50") + geom_boxplot(aes(x = global), 
        width = 0.1, fill = "gray50", outlier.size = 0) + geom_jitter(aes(color = site.name), 
        position = position_jitter(width = 0.1)) + scale_color_manual(values = c(brewer.pal(4, 
        "Dark2"), cmi_site_colors_ramp(nsites))) + ylab(desc) + xlab("")

    # Add the y-range limit and the outlier points on the maximum of the range
    # only if there are any outliers
    pg4 = pg3
    # if (nrow(lst.outlier.std[[measure]]) > 0) { pg4=pg4 +
    # geom_jitter(aes(x=site.name, y=new), data=lst.outlier.std[[measure]],
    # position = position_jitter(width = .1), shape=8) }
    pg4 = pg4 + ylim(df.range.iqr[[measure]])

    # Below assumes that you are doing this with a default axis (sites on x,
    # data on y)
    pg5 = pg4 + theme_bw() + theme(axis.title.x = element_text(family = "Times", 
        face = "plain", size = 12)) + theme(axis.title.y = element_text(family = "Times", 
        face = "plain", size = 12, angle = 90)) + theme(axis.text.x = element_text(family = "Times", 
        face = "plain", size = 10, vjust = 0.5, angle = 45)) + theme(axis.text.y = element_text(family = "Times", 
        face = "plain", size = 10, angle = 90)) + theme(axis.ticks.length = unit(0.15, 
        "lines")) + theme(axis.ticks.margin = unit(0.15, "lines")) + theme(plot.margin = unit(c(0.25, 
        0.25, 0.25, 0.25), "lines")) + theme(legend.position = "none")
    pg = pg5
    print(pg)

    # # Let's now redo the previous plot but with the coordinates flipped # This
    # also requires changing up the angle of the x-axis pg5=pg4 + theme_bw() +
    # theme(axis.title.x = element_text(family = 'Times', face = 'plain',
    # size=14)) + theme(axis.title.y = element_blank()) + theme(axis.text.x =
    # element_text(family = 'Times', face = 'plain', size=12)) +
    # theme(axis.text.y = element_text(family = 'Times', face = 'plain',
    # size=12, angle=0)) + theme(axis.ticks = element_blank()) +
    # theme(plot.margin = unit(c(0.25, 0.25,0.25,0.25), 'lines')) +
    # theme(legend.position = 'none') pg6=pg5 + coord_flip() pg=pg6 print(pg)

    # ggsave('plot_option02.png', pg, height=2.5, width=5)

    # readline('continue?')
    cat("\n\n\n\n")
}
## Warning: Removed 215 rows containing non-finite values (stat_ydensity).
## Warning: Removed 215 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 215 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 130 rows containing non-finite values (stat_ydensity).
## Warning: Removed 130 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 130 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 95 rows containing non-finite values (stat_ydensity).
## Warning: Removed 95 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 95 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 7 rows containing non-finite values (stat_ydensity).
## Warning: Removed 7 rows containing non-finite values (stat_boxplot).
## Warning: Removed 7 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 204 rows containing non-finite values (stat_ydensity).
## Warning: Removed 204 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 204 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 29 rows containing non-finite values (stat_ydensity).
## Warning: Removed 29 rows containing non-finite values (stat_boxplot).
## Warning: Removed 29 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 17 rows containing non-finite values (stat_ydensity).
## Warning: Removed 17 rows containing non-finite values (stat_boxplot).
## Warning: Removed 17 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 6 rows containing non-finite values (stat_ydensity).
## Warning: Removed 6 rows containing non-finite values (stat_boxplot).
## Warning: Removed 6 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 4 rows containing non-finite values (stat_ydensity).
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## Warning: Removed 4 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 18 rows containing non-finite values (stat_ydensity).
## Warning: Removed 18 rows containing non-finite values (stat_boxplot).
## Warning: Removed 18 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 2 rows containing non-finite values (stat_ydensity).
## Warning: Removed 2 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 19 rows containing non-finite values (stat_ydensity).
## Warning: Removed 19 rows containing non-finite values (stat_boxplot).
## Warning: Removed 19 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 85 rows containing non-finite values (stat_ydensity).
## Warning: Removed 85 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 85 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

## Warning: Removed 3 rows containing non-finite values (stat_ydensity).
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 2 rows containing missing values (geom_point).

plot of chunk plot-qc-no-outliers

KEEP OUTLIERS

for (i in 1:nrow(mdf)) {
    measure <- as.character(mdf$measure[i])
    desc <- as.character(mdf$description[i])

    # ### Option 1 First, I'll plot ones with 1%, 5%, 25%, and 50% percentile
    # lines
    pg1 = ggplot(df, aes_string(x = "site.name", y = measure))

    # Add those percentile lines
    pg2 = pg1 + compile_percentiles(percentiles, measure, qcols)
    # pg2=pg1 p <- percentiles[1,] pg2=pg2 +
    # geom_hline(aes_string(yintercept=measure), data=p) p <- percentiles[2,]
    # pg2=pg2 + geom_hline(aes_string(yintercept=measure), data=p) p <-
    # percentiles[3,] pg2=pg2 + geom_hline(aes_string(yintercept=measure),
    # data=p)


    # Add main plot - violin plot + boxplot for all the data - jitter plot for
    # each site (adjust the color) - x and y labels
    pg3 = pg2 + geom_violin(aes(x = global), color = "gray50") + geom_boxplot(aes(x = global), 
        width = 0.1, fill = "gray50", outlier.size = 0) + geom_jitter(aes(color = site.name), 
        position = position_jitter(width = 0.1)) + scale_color_manual(values = c(brewer.pal(4, 
        "Dark2"), cmi_site_colors_ramp(nsites))) + ylab(desc) + xlab("")

    # Add the y-range limit and the outlier points on the maximum of the range
    # only if there are any outliers
    pg4 = pg3
    # if (nrow(lst.outlier.std[[measure]]) > 0) { pg4=pg4 +
    # geom_jitter(aes(x=site.name, y=new), data=lst.outlier.std[[measure]],
    # position = position_jitter(width = .1), shape=8) } pg4=pg4 +
    # ylim(df.range.iqr[[measure]])

    # Below assumes that you are doing this with a default axis (sites on x,
    # data on y)
    pg5 = pg4 + theme_bw() + theme(axis.title.x = element_text(family = "Times", 
        face = "plain", size = 12)) + theme(axis.title.y = element_text(family = "Times", 
        face = "plain", size = 12, angle = 90)) + theme(axis.text.x = element_text(family = "Times", 
        face = "plain", size = 10, vjust = 0.5, angle = 45)) + theme(axis.text.y = element_text(family = "Times", 
        face = "plain", size = 10, angle = 90)) + theme(axis.ticks.length = unit(0.15, 
        "lines")) + theme(axis.ticks.margin = unit(0.15, "lines")) + theme(plot.margin = unit(c(0.25, 
        0.25, 0.25, 0.25), "lines")) + theme(legend.position = "none")
    pg = pg5
    print(pg)

    # # Let's now redo the previous plot but with the coordinates flipped # This
    # also requires changing up the angle of the x-axis pg5=pg4 + theme_bw() +
    # theme(axis.title.x = element_text(family = 'Times', face = 'plain',
    # size=14)) + theme(axis.title.y = element_blank()) + theme(axis.text.x =
    # element_text(family = 'Times', face = 'plain', size=12)) +
    # theme(axis.text.y = element_text(family = 'Times', face = 'plain',
    # size=12, angle=0)) + theme(axis.ticks = element_blank()) +
    # theme(plot.margin = unit(c(0.25, 0.25,0.25,0.25), 'lines')) +
    # theme(legend.position = 'none') pg6=pg5 + coord_flip() pg=pg6 print(pg)

    # ggsave('plot_option02.png', pg, height=2.5, width=5)

    # readline('continue?')
    cat("\n\n\n\n")
}

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

plot of chunk plot-qc-yes-outliers

## Warning: Removed 1 rows containing non-finite values (stat_ydensity).
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

plot of chunk plot-qc-yes-outliers